Data

This TB hackday script uses (pre-processed) RNA sequencing data from the following study to identify granuloma-associated T cell genes:

  1. Foreman et al. 2023 (J Exp Med) CD30 co-stimulation drives differentiation of protective T cells during Mycobacterium tuberculosis infection

Then the expression of these genes can be further examined and validated using pseudo-bulk transcriptomic data from scRNAseq data in this study:

  1. Bromley et al. 2024 (Immunity) CD4+ T cells re-wire granuloma cellularity and regulatory networks to promote immunomodulation following Mtb reinfection

Background

Foreman et al. compared the gene expression of T cells isolated from PBMC vs. T cells from granulomas (n=4 NHP, n=23 granulomas). They identified genes that were associated with granuloma T cells, and specifically identied genes that were correlated with Mtb burden in the granuloma (CFU).

Foreman et al. 2023 Granuloma-associated T cell genes

Hypotheses for hacking

Setup R and load the data.

Load relevant packages. Change <data_dir> variable as appropriate.

Load the pre-processed RNA sequencing data. There are two important files:

  1. foreman_et_al_counts.csv contains log2-transformed counts per million (log2-CPM), that were computed from raw counts by the study authors. The table contains 84 columns, with one column gene_id and the remaining columns matching sampleids in the metadata. There are 30,689 genes in the dataset.

  2. foreman_etal_meta.csv contains all the 83 sample- and granuloma-level metadata that is available for these samples and animals. There are 27 CD8+ T cell samples sorted from granulomas and 31 CD4+ T cell samples sorted from granulomas. Other variables include subject, sort, condition, sex, and granuloma CFU.

library(readr)
library(tidyverse)
library(edgeR)
library(kimma)
library(ggplot2)
library(dplyr)
library(ggrepel)

# NOTE --- REPLACE the <data_dir> FOLDER DESTINTATION AS APPROPRIATE

data_dir <- '/fh/fast/gilbert_p/fg_data/SEATRAC/TB_hackday_2024/processed_data'

# These are already log2-CPM
ncts <- readr::read_csv(file.path(data_dir, "foreman_etal_counts.csv"))
meta <- readr::read_csv(file.path(data_dir, "foreman_etal_meta.csv"))

Prepare the data for differential gene expression analysis

The sampleid columns of the ncts variable and the rows of sampleid in the meta variable match. For this first analysis we will focus on the CD8 T cells sorted from PBMC or granulomas, creating subset tables indicated by _ss variables. Then we initialize the DGEList object and create a limma voom model with a design matrix to identify genes that are associated with granuloma Mtb burden (CFU).

In the accompanying “mean-variace” plot the x-axis represents the average expression levels of genes across all samples. The y-axis represents the square-root of the variance (i.e., standard deviation) of gene expression levels. It shows how the variance changes with respect to the mean expression. Every dot is a gene and the trend line shows the relationship between the mean and the variance. Note that the variance is relatively stable across expression levels and the relationship is smooth; this is good for analysis and voom will use this relationship to adjust the model fits of each gene. If you re-run the code block without the filtering you will see the impact on the mean-variance plot.

meta_ss = meta %>% filter(condition == "CD8_gran")
keep_ids = meta_ss %>% pull(sampleid)
keep_ids = c('gene_id', keep_ids)

ncts_ss = ncts %>% select(any_of(keep_ids))

# Discard genes that have low counts/prevalence
filter = rowSums(ncts_ss > 1) >= (0.5 * ncol(ncts_ss))
ncts_ss = ncts_ss[filter, ]

# Create the object for differential expression testing
dge_o = DGEList(counts=ncts_ss,
                genes=ncts_ss[, 1],
                samples=meta_ss,
                group=meta_ss[['CFU']])
## Setting first column of `counts` as gene annotation.
# Compute weights
# ncts_ss <- calcNormFactors(dge_o, method = "TMM")

# Specify the model/design matrix
design_temp = model.matrix(~CFU, data=meta_ss)

# Create the voom object and fit the model
v <- voom(dge_o, design=design_temp, plot=TRUE)

Fit the model to identify genes associated with protection

# vvwts <- voomWithQualityWeights(dge_o, design=design_temp, normalize.method="none", plot=TRUE)
fit = lmFit(v, design_temp)

# Estimate contrasts and p-values
fit = eBayes(fit, robust=TRUE)

summary(decideTests(fit, adjust.method="fdr", p.value = 0.05))
##        (Intercept)   CFU
## Down             0     0
## NotSig           0 11372
## Up           11372     0
results <- topTable(fit, adjust="BH", coef="CFU", p.value=1, number=Inf, resort.by="P")

head(results %>% select(gene_id, logFC, AveExpr, P.Value, adj.P.Val), 20)
##            gene_id         logFC  AveExpr      P.Value adj.P.Val
## 7215       OSGEPL1 -7.551531e-05 6.045068 2.905859e-05 0.1676882
## 1970    CYHYorf15B -9.964047e-05 5.653331 2.949141e-05 0.1676882
## 9554          STX3 -8.007373e-05 5.338798 9.319884e-05 0.3532857
## 11186       ZNF500 -9.459217e-05 5.294934 1.536120e-04 0.4367190
## 1285          CCL1 -1.038526e-04 5.823269 1.956488e-04 0.4449837
## 4293  LOC100424917 -8.809948e-05 5.679183 3.059415e-04 0.4999033
## 10788       WDSUB1 -8.347674e-05 5.396688 3.077139e-04 0.4999033
## 3958        KDELC2 -8.726556e-05 5.084265 4.218371e-04 0.5668298
## 1840         CRIP2 -8.841231e-05 5.173913 4.735072e-04 0.5668298
## 2529         ENPP5 -8.621299e-05 5.319643 5.449366e-04 0.5668298
## 4284  LOC100424303 -8.519118e-05 5.508063 5.482877e-04 0.5668298
## 3181          GMPR -1.036991e-04 5.419739 7.558196e-04 0.6929904
## 7871        PRDM10 -5.962352e-05 5.427453 8.439460e-04 0.6929904
## 9732        TBXA2R -7.348790e-05 5.632555 8.660223e-04 0.6929904
## 539          ARMT1 -8.102035e-05 5.841423 9.140745e-04 0.6929904
## 9328        SORBS1 -7.333526e-05 4.905782 1.047444e-03 0.7444706
## 9477       ST3GAL5 -8.008782e-05 5.740576 1.213585e-03 0.7723764
## 4286  LOC100424418 -8.122010e-05 4.955814 1.349849e-03 0.7723764
## 8622       RPS6KA4 -6.413657e-05 5.826126 1.405609e-03 0.7723764
## 3797          INVS -5.160037e-05 5.742617 1.467927e-03 0.7723764

Create a volcano plot for single-gene association with protection

# Add a column for significance based on FDR
results <- results %>%
  mutate(Significance = ifelse(adj.P.Val < 0.2, "Significant", "Not Significant"))

# Select the top 10 genes based on adjusted p-value for labeling
top_genes <- results %>%
  arrange(adj.P.Val) %>%
  slice_head(n = 10)

max_logFC <- max(abs(results$logFC), na.rm = TRUE)

# Create the volcano plot
volcano_plot <- ggplot(results, aes(x = logFC, y = -log10(P.Value))) +
  geom_point(aes(color = Significance), alpha = 0.6) +
  scale_color_manual(values = c("Significant" = "red", "Not Significant" = "grey")) +
  geom_text_repel(data = top_genes,
                  aes(label = gene_id),
                  max.overlaps = 10,
                  box.padding = 0.3,
                  point.padding = 0.3,
                  segment.color = "grey50",
                  size = 3) +
  xlim(c(-max_logFC, max_logFC)) +
  theme_minimal() +
  labs(
    x = "log2 Fold-change",
    y = "-log10 P-value",
    color = "FDR < 0.05") +
  theme(plot.title = element_text(hjust = 0.5))

volcano_plot
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Redo the analysis using a mixed-effects model to incorporate data from additional visits and account for the longitudinal design

# Redo the analysis using a mixed-effects model to account for the longitudinal design

dge_o = DGEList(counts=ncts_ss,
                genes=ncts_ss[, 1],
                samples=meta_ss,
                group=meta_ss[['CFU']])
## Setting first column of `counts` as gene annotation.
ncts_ss <- calcNormFactors(dge_o, method = "TMM")

design_temp=model.matrix(~CFU, data=meta_ss)

v <- voom(dge_o, design=design_temp, plot=FALSE)

# Can't figure out why I get this error here
# lme/lmerel model: expression~protect_outcome+sex+visit+(1|animalid)
# Error in `[.data.frame`(weights.format, order(rownames(weights.format)),  : 
#                           undefined columns selected

#klm <- kmFit(dat = v,
#             model = "~CFU + (1|subject)",
#             run_lme = TRUE,
#             libraryID="sampleid",
#             patientID="subject",
#             use_weights = TRUE,
#             metrics = TRUE,
#             run_contrast = FALSE,
#             processors=1)

#summarise_kmFit(fdr = klm$lm)

#plot_volcano(model_result = klm, 
#             model = "lme", variables = "CFU",
#             y_cutoff = 0.05)